library(devtools)
library(rgdal)
library(GGally)
library(ggplot2)
library(plotly)
library(scales)
library(ggthemes)
library(RColorBrewer)
library(viridis)
library(grid)
library(gridExtra)
library(ggimage)
library(png)
library(gridGraphics)
library(dplyr)
library(tidyr)
library(forcats)
#devtools::install_github('bart6114/artyfarty')
library('artyfarty')
library(tm)
library(wordcloud)
# Folder that holds every CSV / text input read by this script.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
location = "C:/Users/hrli1/Desktop/2018 Spring/EDA/NBA/NBA-Visualization-/NBA_data/"
clutch = read.csv(paste0(location, 'fetched.csv'))

# Number of games played (GP) vs number of wins (W): two overlaid horizontal
# bars per team.
df1 = clutch[, c('GP', 'W', 'team')]
df1 = gather(df1, type, count, -team)
# Order the team axis by the number of games played.
temp = df1[df1$type == 'GP', ]
new_levels = as.character(temp[order(temp$count), ]$team)
df1$team = factor(df1$team, levels = new_levels)
df1 %>%
  ggplot(aes(x = team, y = count, fill = type)) +
  # position = "identity" overlays the GP and W bars instead of stacking them.
  geom_bar(stat = "identity", position = "identity") +
  scale_fill_manual(name = "type of games", values = pal("five38")) +
  coord_flip() +
  ggtitle("number of games played (GP) v.s number of wins (W)") +
  geom_hline(yintercept = 0) +
  # Labels are attached to the aesthetics; coord_flip() swaps where they are
  # drawn. (An earlier, contradictory xlab/ylab pair was overridden by these
  # and has been removed.)
  xlab("team name") +
  ylab("number of games") +
  scale_y_continuous(breaks = pretty(df1$count), labels = abs(pretty(df1$count))) +
  theme_scientific()
# Personal fouls (PF) and turnovers (TOV), drawn as a diverging bar chart.
df1 = clutch[, c('PF', 'TOV', 'team')]
df1 = gather(df1, type, count, -team)
# Negate PF so fouls extend to one side of zero and turnovers to the other.
df1$count <- ifelse(df1$type == "PF", df1$count * (-1), df1$count)
# Order the team axis by turnovers.
temp = df1[df1$type == 'TOV', ]
new_levels = as.character(temp[order(temp$count), ]$team)
df1$team = factor(df1$team, levels = new_levels)
df1 %>%
  ggplot(aes(x = team, y = count, fill = type)) +
  geom_bar(stat = "identity", position = "identity") +
  scale_fill_manual(values = pal("five38")) +
  coord_flip() +
  ggtitle("Personal fouls (PF) and turnovers (TOV)") +
  geom_hline(yintercept = 0) +
  # Only the effective labels are kept; an earlier swapped pair was overridden.
  xlab("team name") +
  ylab("counts") +
  # abs() hides the sign that was introduced only to make the bars diverge.
  scale_y_continuous(breaks = pretty(df1$count), labels = abs(pretty(df1$count))) +
  theme_scientific()
# Divergent plot: share of points from 2-pointers (negated) against the
# stacked shares from 3-pointers and free throws.
df1 = clutch[, c('PCT_PTS_2PT', 'PCT_PTS_3PT', 'PCT_PTS_FT', 'team')]
df1 = gather(df1, type, count, -team)
# Order the team axis by the 2PT share (taken before the sign flip below).
temp = df1[df1$type == 'PCT_PTS_2PT', ]
new_levels = as.character(temp[order(temp$count), ]$team)
df1$team = factor(df1$team, levels = new_levels)
df1$count <- ifelse(df1$type == "PCT_PTS_2PT", df1$count * (-1), df1$count)
df1 %>%
  ggplot(aes(x = team, y = count, fill = type)) +
  geom_col() +
  scale_fill_manual(values = pal("five38")) +
  coord_flip() +
  ggtitle("2PT%,3PT%,FT%") +
  geom_hline(yintercept = 0) +
  # Only the effective labels are kept; an earlier swapped pair was overridden.
  xlab("team name") +
  ylab("percentage") +
  # abs() hides the sign that was introduced only to make the bars diverge.
  scale_y_continuous(breaks = pretty(df1$count), labels = abs(pretty(df1$count))) +
  theme_scientific()
# Base URL of the team logo images; "<team>.png?raw=true" is appended per row.
path <- 'https://github.com/NiHaozheng/NBA-Visualization/blob/master/clutch_team/logo/'
#path = 'https://github.com/lihaoranharry/NBA-Visualization-/tree/master/NBA_data/clutch_team/logo/'
#img <- "https://github.com/NiHaozheng/NBA-Visualization/blob/master/clutch_team/logo/ATL.png?raw=true"

# Offensive vs defensive rating, one logo per team; the y axis is reversed.
df1 <- clutch[, c('OFF_RATING', 'DEF_RATING', 'team')]
df1$img <- paste0(path, df1$team, '.png?raw=true')
ggplot(df1, aes(x = OFF_RATING, y = DEF_RATING)) +
  geom_point() +
  geom_image(image = df1$img, size = .05) +
  scale_y_reverse() +
  labs(
    title = "offensive rating v.s. defensive rating",
    x = 'offensive rating',
    y = 'defensive rating'
  ) +
  theme_scientific()
### Part 2
## Lookup tables used to attach a team to every player record.
# player_id -> Team_Name mapping (PERSON_ID is renamed so it can act as the
# merge key in my_read()).
df_name_team <- read.csv(paste0(location, 'Name_Team.csv'))[, c("PERSON_ID", "Team_Name")]
names(df_name_team)[1] <- "player_id"
# Team abbreviation table; its first column is used later to build logo URLs.
df_name_team_abbr <- read.csv(paste0(location, 'abbr_team.csv'))
# Read a per-player statistics CSV and attach team information.
#
# Args:
#   path: path of the CSV file to read; it must contain a `player_id` column.
#   team: data frame with a `player_id` column to merge in (defaults to the
#         script-level `df_name_team`).
# Returns:
#   A data frame holding the full outer join of both tables on `player_id`
#   (unmatched rows are kept and padded with NA), without the `X` column that
#   read.csv creates for an unnamed row-name column.
my_read = function(path, team = df_name_team) {
  stats <- read.csv(file = path)
  merged <- merge(stats, team, by = "player_id", all = TRUE)
  #final$Abbri = df_name_team_abbr
  # drop = FALSE keeps the result a data frame even if one column remains.
  merged[, colnames(merged) != "X", drop = FALSE]
}
# Load every per-player stat table through my_read(), so each one carries the
# team columns, and build the two combined tables.
# Three-point tables.
df_3pct <- my_read(path = paste0(location, '3pct_df.csv'))
df_3fgm <- my_read(path = paste0(location, '3fgm_df.csv'))
df_3 <- merge(df_3fgm, df_3pct, by = "player_id", all = TRUE)
# Overall field-goal tables; merge() suffixes shared columns with .x (fgm)
# and .y (pct).
df_pct <- my_read(path = paste0(location, 'pct_df.csv'))
df_fgm <- my_read(path = paste0(location, 'fgm_df.csv'))
df_all <- merge(df_fgm, df_pct, by = "player_id", all = TRUE)
# Points and the three ft* tables.
df_pts <- my_read(path = paste0(location, 'pts_df.csv'))
df_fta <- my_read(path = paste0(location, 'fta_df.csv'))
df_fct <- my_read(path = paste0(location, 'fct_df.csv'))
df_ftm <- my_read(path = paste0(location, 'ftm_df.csv'))
# Define FGA: Field Goal Attempt, derived as made field goals divided by a
# percentage column.
# NOTE(review): the divisor comes from df_fct (fct_df.csv); if that file holds
# free-throw percentage, df_pct (pct_df.csv) may have been intended -- confirm.
# NOTE(review): the element-wise arithmetic assumes df_fgm, df_fct, df_pts and
# df_fta list the same players in the same row order -- confirm.
FGA = df_fgm$overall / df_fct$overall
# Define TSP: True shooting percent = PTS / (2 * (FGA + 0.44 * FTA))
TSP = df_pts$overall/(2*(FGA+0.44*df_fta$overall))
# Store the result as a new TSP column on df_pts.
df_pts['TSP'] = TSP
# Make a copy of df_pts
df_pts_v1 = df_pts
# Subset to remove all the NAs due to players that did not have a team or did not play in 2016
df_pts_v1_2 = df_pts_v1[!is.na(df_pts_v1$TSP),]
##==================================================================
# Overall TSP against overall points: first every team, then Top4/Last4 only.
# Each plot is also rendered interactively through ggplotly().

# All teams, one facet per team.
p_TSP <- ggplot(df_pts_v1_2) +
  geom_point(aes(x = overall, y = TSP, color = player_name), size = 1) +
  facet_wrap(~Team_Name) +
  labs(title = "TSP V.S PTS Facet on Team", x = 'Overall PTS', y = 'Overall TSP')
ggplotly(p_TSP)

# All teams in a single panel, team encoded as point shape.
p_TSP_All <- ggplot(df_pts_v1_2) +
  geom_point(aes(x = overall, y = TSP, color = player_name, shape = Team_Name), size = 2) +
  labs(title = "TSP V.S PTS All Star", x = 'Overall PTS', y = 'Overall TSP')
ggplotly(p_TSP_All)

##==================================================================
# Top4 / Last4: keep the eight listed teams and tag each row with its group.
TopLowTeam <- c("Celtics", "Cavaliers", "Warriors", "Spurs", "Lakers", "Suns", "76ers", "Nets")
TopLowP_TSP <- df_pts_v1_2[df_pts_v1_2$Team_Name %in% TopLowTeam, ]
TopLowP_TSP["Rank"] <- ifelse(
  TopLowP_TSP$Team_Name %in% c("Celtics", "Cavaliers", "Warriors", "Spurs"),
  "Top4",
  "Down4"
)

p_TSP <- ggplot(TopLowP_TSP) +
  geom_point(aes(x = overall, y = TSP, color = player_name, shape = Rank), size = 2) +
  facet_wrap(~Team_Name) +
  labs(title = "TSP V.S PTS Facet on Top4Last4", x = 'Overall PTS', y = 'Overall TSP')
ggplotly(p_TSP)

p_TSP_All <- ggplot(TopLowP_TSP) +
  geom_point(aes(x = overall, y = TSP, color = player_name, shape = Rank), size = 2) +
  labs(title = "TSP V.S PTS All Star Top4Last4 ", x = 'Overall PTS', y = 'Overall TSP')
ggplotly(p_TSP_All)
#=++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Same TSP computation as above, but on the X5min_plusminus_5 column.
# This overwrites FGA, TSP, df_pts$TSP, df_pts_v1 and df_pts_v1_2.
# Define FGA: Field Goal Attempt on X5min_plusminus_5
# NOTE(review): the divisor comes from df_fct; df_pct may have been intended
# if fct_df.csv holds free-throw percentage -- confirm.
FGA = df_fgm$X5min_plusminus_5 / df_fct$X5min_plusminus_5
# Define TSP: True shooting percent = PTS / (2 * (FGA + 0.44 * FTA))
TSP = df_pts$X5min_plusminus_5/(2*(FGA+0.44*df_fta$X5min_plusminus_5))
df_pts['TSP'] = TSP
# Make a copy of df_pts
df_pts_v1 = df_pts
# Subset to remove all the NAs due to players that did not have a team or did not play in 2016
df_pts_v1_2 = df_pts_v1[!is.na(df_pts_v1$TSP),]
##==================================================================
# X5min_plusminus_5 TSP against X5min_plusminus_5 points: every team first,
# then Top4/Last4 only. Each plot is also rendered through ggplotly().

# All teams, one facet per team.
p_TSP <- ggplot(df_pts_v1_2) +
  geom_point(aes(x = X5min_plusminus_5, y = TSP, color = player_name), size = 1) +
  facet_wrap(~Team_Name) +
  labs(
    title = "TSP V.S PTS Facet on Team",
    x = 'X5min_plusminus_5 PTS',
    y = 'X5min_plusminus_5 TSP'
  )
ggplotly(p_TSP)

# All teams in a single panel, team encoded as point shape.
p_TSP_All <- ggplot(df_pts_v1_2) +
  geom_point(
    aes(x = X5min_plusminus_5, y = TSP, color = player_name, shape = Team_Name),
    size = 2
  ) +
  labs(
    title = "TSP V.S PTS All Star",
    x = 'X5min_plusminus_5 PTS',
    y = 'X5min_plusminus_5 TSP'
  )
ggplotly(p_TSP_All)

##==================================================================
# Top4 / Last4 subset with a Rank label per row.
TopLowTeam <- c("Celtics", "Cavaliers", "Warriors", "Spurs", "Lakers", "Suns", "76ers", "Nets")
TopLowP_TSP <- df_pts_v1_2[df_pts_v1_2$Team_Name %in% TopLowTeam, ]
TopLowP_TSP["Rank"] <- ifelse(
  TopLowP_TSP$Team_Name %in% c("Celtics", "Cavaliers", "Warriors", "Spurs"),
  "Top4",
  "Down4"
)

p_TSP <- ggplot(TopLowP_TSP) +
  geom_point(aes(x = X5min_plusminus_5, y = TSP, color = player_name, shape = Rank), size = 2) +
  facet_wrap(~Team_Name) +
  labs(
    title = "TSP V.S PTS Facet on X5min_plusminus_5 Top4Last4",
    x = 'X5min_plusminus_5 PTS',
    y = 'X5min_plusminus_5 TSP'
  )
ggplotly(p_TSP)

p_TSP_All <- ggplot(TopLowP_TSP) +
  geom_point(aes(x = X5min_plusminus_5, y = TSP, color = player_name, shape = Rank), size = 2) +
  labs(
    title = "TSP V.S PTS All Star X5min_plusminus_5 Top4Last4 ",
    x = 'X5min_plusminus_5 PTS',
    y = 'X5min_plusminus_5 TSP'
  )
ggplotly(p_TSP_All)
##==================================================================
# Overall field-goal percentage (y) against overall field goals made (x).
# The made count is copied from df_fgm onto df_pct row by row, and rows are
# filtered with df_fgm's player_name mask, so both tables are assumed to share
# the same row order (TODO confirm).
df_pct["df_fgm_overall"] <- df_fgm$overall
df_pct_v1 <- df_pct
df_pct_v1_2 <- df_pct_v1[!is.na(df_fgm$player_name), ]

# All teams, one facet per team.
p_FGMPCT <- ggplot(df_pct_v1_2) +
  geom_point(aes(x = df_fgm_overall, y = overall, color = player_name), size = 1) +
  facet_wrap(~Team_Name) +
  labs(title = "pct_overall V.S fgm_overall ", x = 'fgm_overall', y = 'pct_overall')
ggplotly(p_FGMPCT)

# All teams in one panel, team encoded as point shape.
p_FGMPCT_All <- ggplot(df_pct_v1_2) +
  geom_point(aes(x = df_fgm_overall, y = overall, color = player_name, shape = Team_Name), size = 2) +
  labs(title = "pct_overall V.S fgm_overall ", x = 'fgm_overall', y = 'pct_overall')
ggplotly(p_FGMPCT_All)

##==================================================================
# Top4 / Last4 version of the same comparison.
df_pct["df_fgm_overall"] <- df_fgm$overall
df_pct_v1 <- df_pct
df_pct_v1_2 <- df_pct_v1[!is.na(df_fgm$player_name), ]
TopLowP_TSP <- df_pct_v1_2[df_pct_v1_2$Team_Name %in% TopLowTeam, ]
TopLowP_TSP["Rank"] <- ifelse(
  TopLowP_TSP$Team_Name %in% c("Celtics", "Cavaliers", "Warriors", "Spurs"),
  "Top4",
  "Down4"
)

p_FGMPCT <- ggplot(TopLowP_TSP) +
  geom_point(aes(x = df_fgm_overall, y = overall, color = player_name, shape = Rank), size = 2) +
  facet_wrap(~Team_Name) +
  labs(
    title = "pct_overall V.S fgm_overall Facet on Top4Last4",
    x = 'fgm_overall',
    y = 'pct_overall'
  )
ggplotly(p_FGMPCT)

p_FGMPCT_All <- ggplot(TopLowP_TSP) +
  geom_point(aes(x = df_fgm_overall, y = overall, color = player_name, shape = Rank), size = 2) +
  labs(
    title = "pct_overall V.S fgm_overall All Star Top4Last4 ",
    x = 'fgm_overall',
    y = 'pct_overall'
  )
ggplotly(p_FGMPCT_All)
#=++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Percentage vs made field goals for individual clutch windows (Top4/Last4).
# In every section the helper column is still called df_fgm_overall even
# though it holds the made count of the window being plotted.
##==================================================================
# Window: X10sec_down_3
df_pct["df_fgm_overall"] <- df_fgm$X10sec_down_3
df_pct_v1 <- df_pct
df_pct_v1_2 <- df_pct_v1[!is.na(df_fgm$player_name), ]
TopLowP_TSP <- df_pct_v1_2[df_pct_v1_2$Team_Name %in% TopLowTeam, ]
TopLowP_TSP["Rank"] <- ifelse(
  TopLowP_TSP$Team_Name %in% c("Celtics", "Cavaliers", "Warriors", "Spurs"),
  "Top4",
  "Down4"
)
p_FGMPCT <- ggplot(TopLowP_TSP) +
  geom_point(aes(x = df_fgm_overall, y = X10sec_down_3, color = player_name, shape = Rank), size = 2) +
  facet_wrap(~Team_Name) +
  labs(
    title = "pct_X10sec_down_3 V.S fgm_X10sec_down_3 Facet on Top4Last4",
    x = 'fgm_X10sec_down_3',
    y = 'pct_X10sec_down_3'
  )
ggplotly(p_FGMPCT)
p_FGMPCT_All <- ggplot(TopLowP_TSP) +
  geom_point(aes(x = df_fgm_overall, y = X10sec_down_3, color = player_name, shape = Rank), size = 2) +
  labs(
    title = "pct_X10sec_down_3 V.S fgm_X10sec_down_3 All Star Top4Last4 ",
    x = 'fgm_X10sec_down_3',
    y = 'pct_X10sec_down_3'
  )
ggplotly(p_FGMPCT_All)

##=============
# Window: X30sec_plusminus_5
df_pct["df_fgm_overall"] <- df_fgm$X30sec_plusminus_5
df_pct_v1 <- df_pct
df_pct_v1_2 <- df_pct_v1[!is.na(df_fgm$player_name), ]
TopLowP_TSP <- df_pct_v1_2[df_pct_v1_2$Team_Name %in% TopLowTeam, ]
TopLowP_TSP["Rank"] <- ifelse(
  TopLowP_TSP$Team_Name %in% c("Celtics", "Cavaliers", "Warriors", "Spurs"),
  "Top4",
  "Down4"
)
p_FGMPCT <- ggplot(TopLowP_TSP) +
  geom_point(aes(x = df_fgm_overall, y = X30sec_plusminus_5, color = player_name, shape = Rank), size = 2) +
  facet_wrap(~Team_Name) +
  labs(
    title = "pct_X30sec_plusminus_5 V.S fgm_X30sec_plusminus_5 Facet on Top4Last4",
    x = 'fgm_X30sec_plusminus_5',
    y = 'pct_X30sec_plusminus_5'
  )
ggplotly(p_FGMPCT)
p_FGMPCT_All <- ggplot(TopLowP_TSP) +
  geom_point(aes(x = df_fgm_overall, y = X30sec_plusminus_5, color = player_name, shape = Rank), size = 2) +
  labs(
    title = "pct_X30sec_plusminus_5 V.S fgm_X30sec_plusminus_5 All Star Top4Last4 ",
    x = 'fgm_X30sec_plusminus_5',
    y = 'pct_X30sec_plusminus_5'
  )
ggplotly(p_FGMPCT_All)

##=============
# Window: X10sec_down_3 again.
# NOTE(review): this section repeats the first window of this group verbatim;
# a different window may have been intended -- confirm.
df_pct["df_fgm_overall"] <- df_fgm$X10sec_down_3
df_pct_v1 <- df_pct
df_pct_v1_2 <- df_pct_v1[!is.na(df_fgm$player_name), ]
TopLowP_TSP <- df_pct_v1_2[df_pct_v1_2$Team_Name %in% TopLowTeam, ]
TopLowP_TSP["Rank"] <- ifelse(
  TopLowP_TSP$Team_Name %in% c("Celtics", "Cavaliers", "Warriors", "Spurs"),
  "Top4",
  "Down4"
)
p_FGMPCT <- ggplot(TopLowP_TSP) +
  geom_point(aes(x = df_fgm_overall, y = X10sec_down_3, color = player_name, shape = Rank), size = 2) +
  facet_wrap(~Team_Name) +
  labs(
    title = "pct_X10sec_down_3 V.S fgm_X10sec_down_3 Facet on Top4Last4",
    x = 'fgm_X10sec_down_3',
    y = 'pct_X10sec_down_3'
  )
ggplotly(p_FGMPCT)
p_FGMPCT_All <- ggplot(TopLowP_TSP) +
  geom_point(aes(x = df_fgm_overall, y = X10sec_down_3, color = player_name, shape = Rank), size = 2) +
  labs(
    title = "pct_X10sec_down_3 V.S fgm_X10sec_down_3 All Star Top4Last4 ",
    x = 'fgm_X10sec_down_3',
    y = 'pct_X10sec_down_3'
  )
ggplotly(p_FGMPCT_All)
##==================================================================
# Overall three-point percentage (y) against three-pointers made (x).
# The made count is copied from df_3fgm onto df_3pct row by row, so both
# tables are assumed to share the same row order (TODO confirm).
df_3pct["df_3fgm_overall"] <- df_3fgm$overall
df_pct3_v1 <- df_3pct
df_pct3_v1_2 <- df_pct3_v1[!is.na(df_3fgm$player_name), ]

# All teams, one facet per team.
p_3FGM3PCT <- ggplot(df_pct3_v1_2) +
  geom_point(aes(x = df_3fgm_overall, y = overall, color = player_name), size = 1) +
  facet_wrap(~Team_Name) +
  labs(title = "3pct_overall V.S 3fgm_overall ", x = '3fgm_overall', y = '3pct_overall')
ggplotly(p_3FGM3PCT)

# All teams in one panel, team encoded as point shape.
p_3FGM3PCT_All <- ggplot(df_pct3_v1_2) +
  geom_point(aes(x = df_3fgm_overall, y = overall, color = player_name, shape = Team_Name), size = 2) +
  labs(title = "3pct_overall V.S 3fgm_overall ", x = '3fgm_overall', y = '3pct_overall')
ggplotly(p_3FGM3PCT_All)

##==================================================================
# Top4 / Last4 version of the same comparison.
df_3pct["df_3fgm_overall"] <- df_3fgm$overall
df_pct3_v1 <- df_3pct
df_pct3_v1_2 <- df_pct3_v1[!is.na(df_3fgm$player_name), ]
TopLowP_TSP <- df_pct3_v1_2[df_pct3_v1_2$Team_Name %in% TopLowTeam, ]
TopLowP_TSP["Rank"] <- ifelse(
  TopLowP_TSP$Team_Name %in% c("Celtics", "Cavaliers", "Warriors", "Spurs"),
  "Top4",
  "Down4"
)

p_3FGM3PCT <- ggplot(TopLowP_TSP) +
  geom_point(aes(x = df_3fgm_overall, y = overall, color = player_name, shape = Rank), size = 1) +
  facet_wrap(~Team_Name) +
  labs(
    title = "3pct_overall V.S 3fgm_overall Facet on Top4Last4",
    x = '3fgm_overall',
    y = '3pct_overall'
  )
ggplotly(p_3FGM3PCT)

p_3FGM3PCT_All <- ggplot(TopLowP_TSP) +
  geom_point(aes(x = df_3fgm_overall, y = overall, color = player_name, shape = Rank), size = 2) +
  labs(
    title = "3pct_overall V.S 3fgm_overall All Star Top4Last4",
    x = '3fgm_overall',
    y = '3pct_overall'
  )
ggplotly(p_3FGM3PCT_All)
##==================================================================
# X30sec_plusminus_5 from df_ftm (y) against the same column of df_fta (x),
# all teams. Three renderings: faceted, single panel, single panel + jitter.
# The column name below keeps its original spelling ("plusmiuns") because the
# plots refer to it by that name.
df_fta["df_ftm_30sec_plusmiuns_5"] <- df_ftm$X30sec_plusminus_5
df_fta_v1 <- df_fta
df_fta_v1_2 <- df_fta_v1[!is.na(df_fta$player_name), ]

# One facet per team.
p_fta_ftm <- ggplot(df_fta_v1_2) +
  geom_point(
    aes(x = X30sec_plusminus_5, y = df_ftm_30sec_plusmiuns_5, color = player_name),
    size = 1
  ) +
  facet_wrap(~Team_Name) +
  labs(
    title = "df_ftm_30sec_plusmiuns_5 V.S X30sec_plusminus_5 ",
    x = 'X30sec_plusminus_5',
    y = 'df_ftm_30sec_plusmiuns_5'
  )
ggplotly(p_fta_ftm)

# Single panel, team as shape, semi-transparent points.
p_fta_ftm <- ggplot(df_fta_v1_2) +
  geom_point(
    aes(x = X30sec_plusminus_5, y = df_ftm_30sec_plusmiuns_5, color = player_name, shape = Team_Name),
    size = 1.3,
    alpha = 0.5
  ) +
  labs(
    title = "df_ftm_30sec_plusmiuns_5 V.S X30sec_plusminus_5 ",
    x = 'X30sec_plusminus_5',
    y = 'df_ftm_30sec_plusmiuns_5'
  )
ggplotly(p_fta_ftm)

# Same as above with jitter to separate overlapping points.
p_fta_ftm <- ggplot(df_fta_v1_2) +
  geom_point(
    aes(x = X30sec_plusminus_5, y = df_ftm_30sec_plusmiuns_5, color = player_name, shape = Team_Name),
    size = 1.3,
    alpha = 0.5,
    position = "jitter"
  ) +
  labs(
    title = "df_ftm_30sec_plusmiuns_5 V.S X30sec_plusminus_5 ",
    x = 'X30sec_plusminus_5',
    y = 'df_ftm_30sec_plusmiuns_5'
  )
ggplotly(p_fta_ftm)
##==================================================================
# Top4 / Last4 version of the df_ftm vs df_fta X30sec_plusminus_5 comparison.
# Three renderings: faceted, single panel, single panel + jitter.
df_fta["df_ftm_30sec_plusmiuns_5"] <- df_ftm$X30sec_plusminus_5
df_fta_v1 <- df_fta
df_fta_v1_2 <- df_fta_v1[!is.na(df_fta$player_name), ]
TopLowP_TSP <- df_fta_v1_2[df_fta_v1_2$Team_Name %in% TopLowTeam, ]
TopLowP_TSP["Rank"] <- ifelse(
  TopLowP_TSP$Team_Name %in% c("Celtics", "Cavaliers", "Warriors", "Spurs"),
  "Top4",
  "Down4"
)

# One facet per team.
p_fta_ftm <- ggplot(TopLowP_TSP) +
  geom_point(
    aes(x = X30sec_plusminus_5, y = df_ftm_30sec_plusmiuns_5, color = player_name, shape = Rank),
    size = 1
  ) +
  facet_wrap(~Team_Name) +
  labs(
    title = "df_ftm_30sec_plusmiuns_5 V.S X30sec_plusminus_5 Facet on Top4Last4",
    x = 'X30sec_plusminus_5',
    y = 'df_ftm_30sec_plusmiuns_5'
  )
ggplotly(p_fta_ftm)

# Single panel, semi-transparent points.
p_fta_ftm <- ggplot(TopLowP_TSP) +
  geom_point(
    aes(x = X30sec_plusminus_5, y = df_ftm_30sec_plusmiuns_5, color = player_name, shape = Rank),
    size = 1.3,
    alpha = 0.5
  ) +
  labs(
    title = "df_ftm_30sec_plusmiuns_5 V.S X30sec_plusminus_5 All Star Top4Last4",
    x = 'X30sec_plusminus_5',
    y = 'df_ftm_30sec_plusmiuns_5'
  )
ggplotly(p_fta_ftm)

# Same as above with jitter to separate overlapping points.
p_fta_ftm <- ggplot(TopLowP_TSP) +
  geom_point(
    aes(x = X30sec_plusminus_5, y = df_ftm_30sec_plusmiuns_5, color = player_name, shape = Rank),
    size = 1.3,
    alpha = 0.5,
    position = "jitter"
  ) +
  labs(
    title = "df_ftm_30sec_plusmiuns_5 V.S X30sec_plusminus_5 All Star Top4Last4",
    x = 'X30sec_plusminus_5',
    y = 'df_ftm_30sec_plusmiuns_5'
  )
ggplotly(p_fta_ftm)
# Average within group, three-pointers: per team, sum(made) / sum(made / pct)
# for each stat column.
cbP = c("#999999", "#E69F00", "#56B4E9", "#009E73",
        "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
# Team totals of the made counts; columns 3:12 are the numeric stat columns.
df_3fgm_sum = aggregate(df_3fgm[, 3:12], list(df_3fgm$Team_Name), sum, na.rm = TRUE)
# Per-player made / percentage, computed on the numeric stat columns only.
# Dividing the whole data frames also divided the id, name and team columns,
# which warns on factor columns and fails on character columns; those three
# columns were overwritten with df_3fgm's values anyway, so starting from a
# copy of df_3fgm gives the same result.
deno = df_3fgm
deno[, 3:12] = df_3fgm[, 3:12] / df_3pct[, 3:12]
deno_modi = aggregate(deno[, 3:12], list(deno$Team_Name), sum, na.rm = TRUE)
# Team-level ratio; column 1 (Group.1, the team name) is not part of the division.
average3point = df_3fgm_sum
average3point[, -1] = df_3fgm_sum[, -1] / deno_modi[, -1]
average3point$Group.1 = deno_modi$Group.1
# 0 / 0 ratios become 0.
average3point[is.na(average3point)] = 0
TopLowTeam = c("Celtics", "Cavaliers", "Warriors", "Spurs",
               "Lakers", "Suns", "76ers", "Nets")
TopLow3point = average3point[average3point$Group.1 %in% TopLowTeam, ]
RK = ifelse(TopLow3point$Group.1 %in% c("Celtics", "Cavaliers", "Warriors", "Spurs"), "Top4", "Down4")
TopLow3point['TRk'] = RK
#TopLow3point
# Parallel-coordinate plots of the team-level three-point averages for the
# Top4/Last4 teams: colour = team, linetype = rank group.
# NOTE(review): both scales take their labels from the row order of
# TopLow3point, which presumably matches the sorted factor levels -- confirm.

# p1: columns 2:7 of TopLow3point.
p1 <- ggparcoord(
  data = TopLow3point,
  columns = 2:7,
  mapping = aes(color = as.factor(Group.1), linetype = as.factor(TRk)),
  scale = 'globalminmax'
) +
  scale_linetype_discrete("Rank", labels = TopLow3point$TRk) +
  #scale_color_discrete("Team",
  #                    labels=TopLow3point$Group.1)+
  geom_vline(xintercept = 0:6, color = "lightblue") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    title = "Average 3PT Last Xmin yDown Top4 V.S Low4",
    x = 'Indicator',
    y = 'Team Average'
  ) +
  scale_colour_colorblind("Team", labels = TopLow3point$Group.1)

# p2: column 2 together with columns 8:11.
p2 <- ggparcoord(
  data = TopLow3point,
  columns = c(2, 8:11),
  mapping = aes(color = as.factor(Group.1), linetype = as.factor(TRk)),
  scale = 'globalminmax'
) +
  scale_linetype_discrete("Rank", labels = TopLow3point$TRk) +
  #scale_color_discrete("Team",
  #                    labels=TopLow3point$Group.1)+
  geom_vline(xintercept = 0:6, color = "lightblue") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    title = "Average 3PT Last Xmin yDownorHiger Top4 V.S Low4",
    x = 'Indicator',
    y = 'Team Average'
  ) +
  scale_colour_colorblind("Team", labels = TopLow3point$Group.1)
# Average within group, all field goals: per team, sum(made) / sum(made / pct)
# for each stat column.
cbP = c("#999999", "#E69F00", "#56B4E9", "#009E73",
        "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
# Team totals of the made counts; columns 3:12 are the numeric stat columns.
df_fgm_sum = aggregate(df_fgm[, 3:12], list(df_fgm$Team_Name), sum, na.rm = TRUE)
# Per-player made / percentage, computed on the numeric stat columns only.
# Dividing the whole data frames also divided the id, name and team columns,
# which warns on factor columns and fails on character columns; those three
# columns were overwritten with df_fgm's values anyway, so starting from a
# copy of df_fgm gives the same result.
deno = df_fgm
deno[, 3:12] = df_fgm[, 3:12] / df_pct[, 3:12]
deno_modi = aggregate(deno[, 3:12], list(deno$Team_Name), sum, na.rm = TRUE)
# Team-level ratio; column 1 (Group.1, the team name) is not part of the division.
averagepoint = df_fgm_sum
averagepoint[, -1] = df_fgm_sum[, -1] / deno_modi[, -1]
averagepoint$Group.1 = deno_modi$Group.1
# 0 / 0 ratios become 0.
averagepoint[is.na(averagepoint)] = 0
TopLowTeam = c("Celtics", "Cavaliers", "Warriors", "Spurs",
               "Lakers", "Suns", "76ers", "Nets")
TopLowpoint = averagepoint[averagepoint$Group.1 %in% TopLowTeam, ]
RK = ifelse(TopLowpoint$Group.1 %in% c("Celtics", "Cavaliers", "Warriors", "Spurs"), "Top4", "Down4")
TopLowpoint['TRk'] = RK
#averagepoint
# Parallel-coordinate plots of the team-level field-goal averages for the
# Top4/Last4 teams: colour = team, linetype = rank group.
# The linetype labels now come from TopLowpoint, the data actually plotted;
# they were previously copied from the three-point table TopLow3point.
# NOTE(review): both scales take their labels from the row order of
# TopLowpoint, which presumably matches the sorted factor levels -- confirm.

# p3: columns 2:7 of TopLowpoint.
p3 = ggparcoord(data = TopLowpoint,
                columns = 2:7,
                mapping = aes(color = as.factor(Group.1),
                              linetype = as.factor(TRk)),
                scale = 'globalminmax'
                ) +
  scale_linetype_discrete("Rank",
                          labels = TopLowpoint$TRk) +
  geom_vline(xintercept = 0:6, color = "lightblue") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Average TotalPT Last Xmin yDown Top4 V.S Low4", x = 'Indicator', y = 'Team Average') +
  scale_colour_colorblind("Team",
                          labels = TopLowpoint$Group.1)

# p4: column 2 together with columns 8:11.
p4 = ggparcoord(data = TopLowpoint,
                columns = c(2, 8:11),
                mapping = aes(color = as.factor(Group.1),
                              linetype = as.factor(TRk)),
                scale = 'globalminmax'
                ) +
  scale_linetype_discrete("Rank",
                          labels = TopLowpoint$TRk) +
  geom_vline(xintercept = 0:6, color = "lightblue") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Average TotalPT Last Xmin yDownorHiger Top4 V.S Low4", x = 'Indicator', y = 'Team Average') +
  scale_colour_colorblind("Team",
                          labels = TopLowpoint$Group.1)

# 2 x 2 grid: three-point plots (p1, p2) on top, all field goals (p3, p4) below.
grid.arrange(p1, p2, p3, p4, nrow = 2)
##==================================================================
# Team averages in the X10sec_down_3 window against the overall average, drawn
# with team logos, for all teams.
# NOTE(review): rows 2:31 are kept by position, presumably to drop a leading
# non-team group and keep 30 teams, and abbreviations are attached by position
# as well -- confirm the ordering of abbr_team.csv. Running this section twice
# shifts the rows again.
averagepoint = averagepoint[2:31, ]
averagepoint['abbr'] = df_name_team_abbr[, 1]
average3point = average3point[2:31, ]
average3point['abbr'] = df_name_team_abbr[, 1]
path = 'https://github.com/NiHaozheng/NBA-Visualization/blob/master/clutch_team/logo/'
averagepoint$img = paste0(path, averagepoint$abbr, '.png?raw=true')
average3point$img = paste0(path, average3point$abbr, '.png?raw=true')
# The two titles were swapped relative to the data: averagepoint holds the
# all-field-goal averages, average3point the three-point averages.
p1 = ggplot(averagepoint, aes(overall, X10sec_down_3)) +
  geom_point() +
  geom_image(image = averagepoint$img, size = .05) +
  theme_scientific() +
  labs(title = "Total Average X10sec_down_3 v.s. Overall", x = 'Overall', y = 'X10sec_down_3')
p2 = ggplot(average3point, aes(overall, X10sec_down_3)) +
  geom_point() +
  geom_image(image = average3point$img, size = .05) +
  theme_scientific() +
  labs(title = "3pt Average 10sec_down_3 v.s. Overall", x = 'Overall', y = 'X10sec_down_3')
grid.arrange(p1, p2, nrow = 1)
##==================================================================
# Same logo scatter restricted to the Top4/Last4 teams.
# TopLowP_TSP_1 is taken from averagepoint (all field goals) and
# TopLowP_TSP_2 from average3point (three-pointers); the two titles were
# swapped relative to that and now match the data plotted.
TopLowP_TSP_1 = averagepoint[averagepoint$Group.1 %in% TopLowTeam, ]
TopLowP_TSP_2 = average3point[average3point$Group.1 %in% TopLowTeam, ]
p3 = ggplot(TopLowP_TSP_1, aes(overall, X10sec_down_3)) +
  geom_point() +
  geom_image(image = TopLowP_TSP_1$img,
             size = .05) +
  theme_scientific() +
  labs(title = "Total Average X10sec_down_3 v.s. Overall TopDown4", x = 'Overall', y = 'X10sec_down_3')
p4 = ggplot(TopLowP_TSP_2, aes(overall, X10sec_down_3)) +
  geom_point() +
  geom_image(image = TopLowP_TSP_2$img,
             size = .05) +
  theme_scientific() +
  labs(title = "3pt Average 10sec_down_3 v.s. Overall TopDown4", x = 'Overall', y = 'X10sec_down_3')
grid.arrange(p3, p4, nrow = 1)
##==================================================================
# df_pct: overall (y) against X1min_down_5 (x), one facet per team.
# Points are jittered slightly and semi-transparent to reveal overlaps.

# Every team.
ggplot() +
  geom_point(
    data = df_pct,
    mapping = aes(x = X1min_down_5, y = overall),
    position = position_jitter(width = 0.01, height = 0.02),
    size = 3,
    alpha = 0.5
  ) +
  facet_wrap(~Team_Name) +
  labs(title = "overall V.S X1min_down_5", x = 'X1min_down_5', y = 'overall')

##==================================================================
# Top4 / Last4 teams only.
TopLowP_TSP_1 <- df_pct[df_pct$Team_Name %in% TopLowTeam, ]
ggplot() +
  geom_point(
    data = TopLowP_TSP_1,
    mapping = aes(x = X1min_down_5, y = overall),
    position = position_jitter(width = 0.01, height = 0.02),
    size = 3,
    alpha = 0.5
  ) +
  facet_wrap(~Team_Name) +
  labs(title = "overall V.S X1min_down_5", x = 'X1min_down_5', y = 'overall')
##==================================================================
# df_all (df_fgm merged with df_pct): X5min_plusminus_5 made count (.x) on the
# x axis against the percentage (.y) on the y axis, one facet per team.

# Every team.
pp <- ggplot() +
  geom_point(
    data = df_all,
    mapping = aes(x = X5min_plusminus_5.x, y = X5min_plusminus_5.y, color = player_name.x),
    position = position_jitter(width = 0.01, height = 0.02),
    size = 2,
    alpha = 0.5
  ) +
  facet_wrap(~Team_Name.x) +
  labs(
    title = "5min_plusminus_5_percent V.S X5min_plusminus_5_actual",
    x = 'X5min_plusminus_5_actual',
    y = '5min_plusminus_5_percent'
  )
ggplotly(pp)

##==================================================================
# Top4 / Last4 teams only, rank group encoded as point shape.
TopLowP_TSP_1 <- df_all[df_all$Team_Name.y %in% TopLowTeam, ]
TopLowP_TSP_1["Rank"] <- ifelse(
  TopLowP_TSP_1$Team_Name.y %in% c("Celtics", "Cavaliers", "Warriors", "Spurs"),
  "Top4",
  "Down4"
)
pp <- ggplot() +
  geom_point(
    data = TopLowP_TSP_1,
    mapping = aes(
      x = X5min_plusminus_5.x,
      y = X5min_plusminus_5.y,
      color = player_name.x,
      shape = Rank
    ),
    position = position_jitter(width = 0.01, height = 0.02),
    size = 2,
    alpha = 0.5
  ) +
  facet_wrap(~Team_Name.x) +
  labs(
    title = "5min_plusminus_5_percent V.S X5min_plusminus_5_actual",
    x = 'X5min_plusminus_5_actual',
    y = '5min_plusminus_5_percent'
  )
ggplotly(pp)
# Scatter-plot matrices of made (.x) and percentage (.y) columns of df_all,
# grouped by clutch-window family.
# "down_3" windows.
pairs(df_all[, c(
  "X10sec_down_3.x", "X10sec_down_3.y",
  "X30sec_down_3.x", "X30sec_down_3.y"
)])
#df_all
# "down_5" windows.
pairs(df_all[, c(
  "X1min_down_5.x", "X1min_down_5.y",
  "X3min._down_5.x", "X3min._down_5.y",
  "X5min._down_5.x", "X5min._down_5.y"
)])
#df_all
# "plusminus_5" windows.
pairs(df_all[, c(
  "X30sec_plusminus_5.x", "X30sec_plusminus_5.y",
  "X1min_plusminus_5.x", "X1min_plusminus_5.y",
  "X3min_plusminus_5.x", "X3min_plusminus_5.y"
)])
##==================================================================
# Mean of overall.x (the df_fgm side of df_all) per team, drawn as a
# horizontal bar chart sorted by that mean.
df_all$Team_Name.x <- as.factor(df_all$Team_Name.x)
countorder <- summarize(
  group_by(df_all, Team_Name.x),
  av = mean(overall.x, na.rm = TRUE)
)
#df_all = merge(df_fgm,df_pct,by = "player_id",all=TRUE)
ggplot(countorder, aes(x = reorder(Team_Name.x, av), y = av)) +
  geom_col(color = "tomato", fill = "orange", alpha = .2) +
  coord_flip() +
  labs(title = "Team Average Overall fgm", x = 'Team', y = 'Average Overall fgm') +
  theme_scientific()
##==================================================================
# Mean of overall.x per team for the Top4/Last4 teams, bars filled by rank group.
TopLowP_TSP_1 = df_all[df_all$Team_Name.y %in% TopLowTeam, ]
countorder = TopLowP_TSP_1 %>%
  group_by(Team_Name.x) %>%
  summarize(av = mean(overall.x, na.rm = TRUE))
countorder['Rank'] = ifelse(countorder$Team_Name.x %in% c("Celtics", "Cavaliers", "Warriors", "Spurs"), "Top4", "Down4")
#countorder
countorder
## # A tibble: 8 x 3
## Team_Name.x av Rank
## <fct> <dbl> <chr>
## 1 76ers 3.91 Down4
## 2 Cavaliers 4.23 Top4
## 3 Celtics 4.70 Top4
## 4 Lakers 4.56 Down4
## 5 Nets 3.35 Down4
## 6 Spurs 3.60 Top4
## 7 Suns 3.85 Down4
## 8 Warriors 4.00 Top4
ggplot(countorder, aes(reorder(Team_Name.x, av), av, fill = Rank)) +
  geom_col() +
  coord_flip() +
  theme_scientific() +
  labs(title = "Team Average Overall fgm", x = 'Team', y = 'Average Overall fgm') +
  # Rank is mapped to `fill`, so the colour-blind palette must be a fill scale;
  # the previous colour scale had no mapped aesthetic to act on. The default
  # labels are the two Rank values.
  scale_fill_colorblind("Rank")
### Part4
#TopLowTeam = c("Celtics","Cavaliers","Warriors","Spurs","Lakers","Suns","76ers","Nets")
# Raw tweet dumps, each read as one single string: the whole corpus first,
# then one file per team.
tweet_content <- readr::read_file(paste0(location, 'Twitter/tweet_content.txt'))
Spurs_tweet_content <- readr::read_file(paste0(location, 'Twitter/By Team/Spurs.txt'))
Warriors_tweet_content <- readr::read_file(paste0(location, 'Twitter/By Team/Warriors.txt'))
Lakers_tweet_content <- readr::read_file(paste0(location, 'Twitter/By Team/Lakers.txt'))
T76ers_tweet_content <- readr::read_file(paste0(location, 'Twitter/By Team/76ers.txt'))
# Print the most frequent words of a text and draw a word cloud of it.
#
# Args:
#   tweet_content: text to analyse (the callers pass one long string).
#   min_freq: minimum frequency a word needs to be drawn in the cloud.
# The text is stripped of punctuation, numbers, English stop words and extra
# whitespace and lower-cased; word counts are taken from the first document
# column of the term-document matrix. The value of wordcloud() is returned.
My_word_cloud <- function(tweet_content, min_freq) {
  # Text clean-up, applied step by step in the same order as before.
  corpus <- Corpus(VectorSource(tweet_content))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, tolower)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, PlainTextDocument)

  # Word counts as a one-column matrix, most frequent word first.
  term_matrix <- as.matrix(TermDocumentMatrix(corpus))
  word_freq <- as.matrix(term_matrix[, 1])
  word_freq <- as.matrix(word_freq[order(word_freq, decreasing = TRUE), ])

  print("head(Whole twitter)")
  print(head(word_freq))
  print("Whole twitter's most occuring words:")
  print(head(rownames(word_freq)))

  # Drop the three lightest shades of the palette.
  cloud_colors <- brewer.pal(9, "YlGnBu")[-(1:3)]
  wordcloud(
    rownames(word_freq),
    word_freq,
    min.freq = min_freq,
    scale = c(5, .2),
    random.order = FALSE,
    random.color = FALSE,
    colors = cloud_colors
  )
}
## Let's look at what is going on if we plot the twitter!
My_word_cloud(tweet_content = tweet_content, min_freq = 150)
## [1] "head(Whole twitter)"
## [,1]
## ontnt 1539
## warriors 1367
## pts 1337
## nba 1238
## spurs 1160
## player 1072
## [1] "Whole twitter's most occuring words:"
## [1] "ontnt" "warriors" "pts" "nba" "spurs" "player"
## Let's look at what is going on WITH TEAM
# 76ers tweets. The variable names still say "Spurs" because the same names
# are reused by every team section below.
# Lines are split into two alternating groups: group 1 is the source of the
# timestamp (4th space-separated token), group 2 is the tweet text.
splited_Spurs = strsplit(T76ers_tweet_content, "\n")
splited_Spurs_2 = split(splited_Spurs[[1]], 1:2)
# Vectorised extraction replaces a 1:length() loop that grew the vector and
# failed on an empty file; lines with fewer than four tokens still give NA.
tweet_time = vapply(
  strsplit(splited_Spurs_2[[1]], " "),
  function(parts) parts[4],
  character(1),
  USE.NAMES = FALSE
)
Spurs_df = data.frame("Date" = tweet_time, "content" = splited_Spurs_2[[2]])
# Join all tweet texts and remove the team name so it does not dominate the cloud.
sp = paste(splited_Spurs_2[[2]], collapse = ' ')
sp = gsub("76ers", "", sp)
My_word_cloud(tweet_content = sp, min_freq = 10)
## [1] "head(Whole twitter)"
## [,1]
## heat 132
## game 98
## nba 87
## embiid 85
## philadelphia 82
## miami 62
## [1] "Whole twitter's most occuring words:"
## [1] "heat" "game" "nba" "embiid"
## [5] "philadelphia" "miami"
#======================================================================
# Spurs tweets.
# Lines are split into two alternating groups: group 1 is the source of the
# timestamp (4th space-separated token), group 2 is the tweet text.
splited_Spurs = strsplit(Spurs_tweet_content, "\n")
splited_Spurs_2 = split(splited_Spurs[[1]], 1:2)
# Vectorised extraction replaces a 1:length() loop that grew the vector and
# failed on an empty file; lines with fewer than four tokens still give NA.
tweet_time = vapply(
  strsplit(splited_Spurs_2[[1]], " "),
  function(parts) parts[4],
  character(1),
  USE.NAMES = FALSE
)
Spurs_df = data.frame("Date" = tweet_time, "content" = splited_Spurs_2[[2]])
# Join all tweet texts and remove the team name so it does not dominate the cloud.
sp = paste(splited_Spurs_2[[2]], collapse = ' ')
sp = gsub("Spurs", "", sp)
sp = gsub("spurs", "", sp)
My_word_cloud(tweet_content = sp, min_freq = 15)
## [1] "head(Whole twitter)"
## [,1]
## man 416
## utd 324
## beat 286
## warriors 236
## new 228
## game 227
## [1] "Whole twitter's most occuring words:"
## [1] "man" "utd" "beat" "warriors" "new" "game"
#======================================================================
# Warriors tweets. The variable names still say "Spurs" because the same names
# are reused by every team section.
# Lines are split into two alternating groups: group 1 is the source of the
# timestamp (4th space-separated token), group 2 is the tweet text.
splited_Spurs = strsplit(Warriors_tweet_content, "\n")
splited_Spurs_2 = split(splited_Spurs[[1]], 1:2)
# Vectorised extraction replaces a 1:length() loop that grew the vector and
# failed on an empty file; lines with fewer than four tokens still give NA.
tweet_time = vapply(
  strsplit(splited_Spurs_2[[1]], " "),
  function(parts) parts[4],
  character(1),
  USE.NAMES = FALSE
)
Spurs_df = data.frame("Date" = tweet_time, "content" = splited_Spurs_2[[2]])
# Join all tweet texts and remove the team name so it does not dominate the cloud.
sp = paste(splited_Spurs_2[[2]], collapse = ' ')
sp = gsub("Warriors", "", sp)
sp = gsub("warriors", "", sp)
My_word_cloud(tweet_content = sp, min_freq = 15)
## [1] "head(Whole twitter)"
## [,1]
## curry 256
## game 255
## spurs 239
## will 215
## stephen 187
## state 186
## [1] "Whole twitter's most occuring words:"
## [1] "curry" "game" "spurs" "will" "stephen" "state"
#======================================================================
# Lakers tweets. The variable names still say "Spurs" because the same names
# are reused by every team section.
# Lines are split into two alternating groups: group 1 is the source of the
# timestamp (4th space-separated token), group 2 is the tweet text.
splited_Spurs = strsplit(Lakers_tweet_content, "\n")
splited_Spurs_2 = split(splited_Spurs[[1]], 1:2)
# Vectorised extraction replaces a 1:length() loop that grew the vector and
# failed on an empty file; lines with fewer than four tokens still give NA.
tweet_time = vapply(
  strsplit(splited_Spurs_2[[1]], " "),
  function(parts) parts[4],
  character(1),
  USE.NAMES = FALSE
)
Spurs_df = data.frame("Date" = tweet_time, "content" = splited_Spurs_2[[2]])
# Join all tweet texts and remove the team name so it does not dominate the cloud.
sp = paste(splited_Spurs_2[[2]], collapse = ' ')
sp = gsub("Lakers", "", sp)
sp = gsub("lakers", "", sp)
My_word_cloud(tweet_content = sp, min_freq = 15)
## [1] "head(Whole twitter)"
## [,1]
## kawhi 226
## leonard 131
## lebron 95
## los 92
## trade 92
## nba 80
## [1] "Whole twitter's most occuring words:"
## [1] "kawhi" "leonard" "lebron" "los" "trade" "nba"
# Preprocessed per-team tweet tables. colClasses drops the first CSV column
# and lets read.csv infer the type of the other two.
pro_76 <- read.csv(
  paste0(location, 'Twitter/By Team preprocessed_data/preprocessed_76ers.csv'),
  colClasses = c("NULL", NA, NA)
)
pro_spurs <- read.csv(
  paste0(location, 'Twitter/By Team preprocessed_data/preprocessed_Spurs.csv'),
  colClasses = c("NULL", NA, NA)
)
pro_warriours <- read.csv(
  paste0(location, 'Twitter/By Team preprocessed_data/preprocessed_Warriors.csv'),
  colClasses = c("NULL", NA, NA)
)
pro_lakers <- read.csv(
  paste0(location, 'Twitter/By Team preprocessed_data/preprocessed_Lakers.csv'),
  colClasses = c("NULL", NA, NA)
)

# Full outer join of the four tables on `time`. Columns are renamed after
# every merge so each team's column has a distinct name before the next join.
temp_1 <- merge(pro_76, pro_spurs, by = 'time', all = TRUE)
names(temp_1) <- c("time", "76er", "spurs")
temp_1 <- merge(temp_1, pro_warriours, by = 'time', all = TRUE)
names(temp_1) <- c("time", "76er", "spurs", "pro_warriours")
temp_1 <- merge(temp_1, pro_lakers, by = 'time', all = TRUE)
names(temp_1) <- c("time", "76er", "spurs", "pro_warriours", "pro_lakers")

# Cells holding the literal text "[]" are treated as missing.
temp_1[temp_1 == "[]"] <- NA
#mydf = temp_1[sample(nrow(temp_1), 1000), ]## random sample 1000 rows/records
# Build a tile plot showing which team columns are missing at which time.
#
# Args:
#   seg: data frame with a `time` column plus one column per team.
#   title: plot title.
# Returns:
#   A ggplot object with one tile per (team, time) pair, filled by whether the
#   value is NA; both axes are ordered by their number of missing values.
my_missing <- function(seg, title) {
  # Long format: one row per (time, team column) pair, with a yes/no flag
  # for the legend and a 0/1 flag used for ordering.
  long_df <- gather(seg, key, value, -time)
  long_df <- mutate(
    long_df,
    missing = ifelse(is.na(value), "yes", "no"),
    missing2 = ifelse(missing == "yes", 1, 0)
  )

  ggplot(
    long_df,
    aes(
      x = fct_reorder(key, -missing2, sum),
      y = fct_reorder(time, -missing2, sum)
    )
  ) +
    geom_tile(aes(fill = missing), color = "white") +
    theme(
      axis.text.x = element_text(),
      axis.text.y = element_text(size = 2, angle = 90)
    ) +
    labs(title = title, x = 'Team', y = 'Time') +
    scale_fill_manual(values = c("slategray2", "tomato2"))
}
### The data is too large for one plot, so it is split by time to see the pattern.
# The slices are hard-coded row ranges of temp_1. The last three used to start
# on the previous slice's final row (2829, 3708, 4667), so that row appeared in
# two panels; each slice now starts on the following row, like the first four.
# NOTE(review): the row boundaries presumably match the time windows named in
# the titles -- confirm against the data.
#### 02:00:00-5:00:00
p1 = my_missing(temp_1[1:213, ], title = "Missing 02:00:00-5:00:00")
#### 15:30:00-16:00:00
p2 = my_missing(temp_1[214:1002, ], title = "Missing 15:30:00-16:00:00")
#### 16:00:00-16:30:00
p3 = my_missing(temp_1[1003:1961, ], title = "Missing 16:00:00-16:30:00")
#### 16:30:00-17:00:00
p4 = my_missing(temp_1[1962:2829, ], title = "Missing 16:30:00-17:00:00")
#### 17:00:00-17:30:00
p5 = my_missing(temp_1[2830:3708, ], title = "Missing 17:00:00-17:30:00")
#### 17:30:00-18:30:00
p6 = my_missing(temp_1[3709:4667, ], title = "Missing 17:30:00-18:30:00")
#### 18:30:00-19:00:00
p7 = my_missing(temp_1[4668:5461, ], title = "Missing 18:30:00-19:00:00")
# Only p2..p6 are arranged; p1 and p7 are built but not displayed here.
grid.arrange(p2, p3, p4, p5, p6, nrow = 1)